{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": [],
   "toc_visible": true
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Simplified Vector Search (kNN) Implementation Guide\n"
   ],
   "metadata": {
    "id": "XU4UjiHpYdDT"
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Loading the Embedding Model\n",
    "Loading embedding model: [sentence-transformers/all-distilroberta-v1](https://huggingface.co/sentence-transformers/all-distilroberta-v1)\n",
    "\n",
    "Loading code borrowed from [elasticsearch-labs](https://www.elastic.co/search-labs) NLP text search [example notebook](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/integrations/hugging-face/loading-model-from-hugging-face.ipynb)\n"
   ],
   "metadata": {
    "id": "5lV5UN90l4YN"
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Z0TiDltHkebY"
   },
   "outputs": [],
   "source": [
    "# install packages\n",
    "!pip install -qU eland elasticsearch transformers sentence-transformers torch==1.13"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# import modules\n",
    "import pandas as pd, json\n",
    "from elasticsearch import Elasticsearch\n",
    "from elasticsearch.helpers import bulk\n",
    "from getpass import getpass\n",
    "from urllib.request import urlopen\n",
    "from pprint import pprint"
   ],
   "metadata": {
    "id": "Riwvd3CHO9qU"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "API_KEY = getpass(\"Elastic deployment API Key\")\n",
    "CLOUD_ID = getpass(\"Elastic deployment Cloud ID\")\n",
    "HUB_MODEL_ID = getpass(\n",
    "    \"Hugging Face Model Hub ID\"\n",
    ")  # eg sentence-transformers/all-distilroberta-v1\n",
    "\n",
    "es = Elasticsearch(cloud_id=CLOUD_ID, api_key=API_KEY)\n",
    "es.info()  # should return cluster info"
   ],
   "metadata": {
    "id": "So9bJJDVNzgF"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Import the Hugging Face model named by HUB_MODEL_ID into the deployment as a text_embedding model.\n",
    "# CLOUD_ID, HUB_MODEL_ID and API_KEY are the values entered in the previous cell.\n",
    "# NOTE(review): --start presumably also starts the model deployment after upload - confirm against the eland docs.\n",
    "!eland_import_hub_model --cloud-id $CLOUD_ID --hub-model-id $HUB_MODEL_ID --task-type text_embedding --es-api-key $API_KEY --start"
   ],
   "metadata": {
    "id": "dsFsmzZwpujb"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Ingest pipeline setup"
   ],
   "metadata": {
    "id": "71wNrH0vl4zi"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "pipeline = {\n",
    "    \"processors\": [\n",
    "        {\n",
    "            \"inference\": {\n",
    "                \"field_map\": {\"my_text\": \"text_field\"},\n",
    "                \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n",
    "                \"target_field\": \"ml.inference.my_vector\",\n",
    "                \"on_failure\": [\n",
    "                    {\n",
    "                        \"append\": {\n",
    "                            \"field\": \"_source._ingest.inference_errors\",\n",
    "                            \"value\": [\n",
    "                                {\n",
    "                                    \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n",
    "                                    \"pipeline\": \"ml-inference-title-vector\",\n",
    "                                    \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n",
    "                                }\n",
    "                            ],\n",
    "                        }\n",
    "                    }\n",
    "                ],\n",
    "            }\n",
    "        },\n",
    "        {\n",
    "            \"set\": {\n",
    "                \"field\": \"my_vector\",\n",
    "                \"if\": \"ctx?.ml?.inference != null && ctx.ml.inference['my_vector'] != null\",\n",
    "                \"copy_from\": \"ml.inference.my_vector.predicted_value\",\n",
    "                \"description\": \"Copy the predicted_value to 'my_vector'\",\n",
    "            }\n",
    "        },\n",
    "        {\"remove\": {\"field\": \"ml.inference.my_vector\", \"ignore_missing\": True}},\n",
    "    ]\n",
    "}\n",
    "\n",
    "pipeline_id = \"vector_embedding_demo\"\n",
    "response = es.ingest.put_pipeline(id=pipeline_id, body=pipeline)\n",
    "\n",
    "# Print the response\n",
    "print(response)"
   ],
   "metadata": {
    "id": "SL47BJNyl3-r",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "43588d08-9dfb-4b13-9c42-58e071cf3526"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "{'acknowledged': True}\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "<ipython-input-8-5c7708b710af>:44: DeprecationWarning: The 'body' parameter is deprecated and will be removed in a future version. Instead use individual parameters.\n",
      "  response = es.ingest.put_pipeline(id=pipeline_id, body=pipeline)\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Index Mapping / Template setup"
   ],
   "metadata": {
    "id": "TgBeEw_Ql5I5"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "index_patterns = [\"my_vector_index-*\"]\n",
    "\n",
    "order = 1\n",
    "\n",
    "settings = {\n",
    "    \"number_of_shards\": 1,\n",
    "    \"number_of_replicas\": 1,\n",
    "    \"index.default_pipeline\": pipeline_id,\n",
    "}\n",
    "\n",
    "mappings = {\n",
    "    \"properties\": {\n",
    "        \"my_vector\": {\n",
    "            \"type\": \"dense_vector\",\n",
    "            \"dims\": 768,\n",
    "            \"index\": True,\n",
    "            \"similarity\": \"dot_product\",\n",
    "        },\n",
    "        \"my_text\": {\"type\": \"text\"},\n",
    "    },\n",
    "    \"_source\": {\"excludes\": [\"my_vector\"]},\n",
    "}\n",
    "\n",
    "\n",
    "# Create the index template\n",
    "response = es.indices.put_template(\n",
    "    name=\"my_vector_index\",\n",
    "    index_patterns=index_patterns,\n",
    "    order=order,\n",
    "    settings=settings,\n",
    "    mappings=mappings,\n",
    ")\n",
    "\n",
    "\n",
    "# Print the response\n",
    "print(response)"
   ],
   "metadata": {
    "id": "zNqjEiPZl36N",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "55130ac4-042f-4d65-bc4b-08c6527d85d4"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "{'acknowledged': True}\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "<ipython-input-11-40e8294e4183>:34: ElasticsearchWarning: Legacy index templates are deprecated in favor of composable templates.\n",
      "  response = es.indices.put_template(name=\"my_vector_index\",\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Indexing Data\n"
   ],
   "metadata": {
    "id": "bztQcxbll5cs"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# Index queried by the search cells; the name matches the 'my_vector_index-*' template pattern.\n",
    "index_name = \"my_vector_index-01\""
   ],
   "metadata": {
    "id": "XbapSs1c-hkd"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "data = [\n",
    "    (\"Hey, careful, man, there's a beverage here!\", \"The Dude\"),\n",
    "    (\n",
    "        \"I’m The Dude. So, that’s what you call me. You know, that or, uh, His Dudeness, or, uh, Duder, or El Duderino, if you’re not into the whole brevity thing\",\n",
    "        \"The Dude\",\n",
    "    ),\n",
    "    (\n",
    "        \"You don't go out looking for a job dressed like that? On a weekday?\",\n",
    "        \"The Big Lebowski\",\n",
    "    ),\n",
    "    (\"What do you mean brought it bowling, Dude?\", \"Walter Sobchak\"),\n",
    "    (\n",
    "        \"Donny was a good bowler, and a good man. He was one of us. He was a man who loved the outdoors... and bowling, and as a surfer he explored the beaches of Southern California, from La Jolla to Leo Carrillo and... up to... Pismo\",\n",
    "        \"Walter Sobchak\",\n",
    "    ),\n",
    "]\n",
    "\n",
    "actions = [\n",
    "    {\n",
    "        \"_op_type\": \"index\",\n",
    "        \"_index\": \"my_vector_index-01\",\n",
    "        \"_source\": {\"my_text\": text, \"my_metadata\": metadata},\n",
    "    }\n",
    "    for text, metadata in data\n",
    "]\n",
    "\n",
    "bulk(es, actions)\n",
    "\n",
    "# Refresh the index to make sure all data is searchable\n",
    "es.indices.refresh(index=\"my_vector_index-01\")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bSIJ-AngVmUi",
    "outputId": "49074d6e-1d30-44e1-d565-edac0251eae1"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})"
      ]
     },
     "metadata": {},
     "execution_count": 14
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Querying Data\n"
   ],
   "metadata": {
    "id": "ENlZ3Ndjl5yl"
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Approximate k-nearest neighbor (kNN)"
   ],
   "metadata": {
    "id": "Xk4CBDpimfDH"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "knn = {\n",
    "    \"field\": \"my_vector\",\n",
    "    \"k\": 1,\n",
    "    \"num_candidates\": 5,\n",
    "    \"query_vector_builder\": {\n",
    "        \"text_embedding\": {\n",
    "            \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n",
    "            \"model_text\": \"Watchout I have a drink\",\n",
    "        }\n",
    "    },\n",
    "}\n",
    "\n",
    "response = es.search(index=index_name, knn=knn, source=True)\n",
    "\n",
    "pprint(response[\"hits\"][\"hits\"])"
   ],
   "metadata": {
    "id": "xl76_rM4l3iC",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "9a796cf1-4beb-4405-91b9-c323db756d36"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[{'_id': 'UO5Y3IoB3ljSe18vZY6D',\n",
      "  '_index': 'my_vector_index-01',\n",
      "  '_score': 0.78170115,\n",
      "  '_source': {'ml': {'inference': {}},\n",
      "              'my_metadata': 'The Dude',\n",
      "              'my_text': \"Hey, careful, man, there's a beverage here!\"}}]\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Hybrid Searching (kNN + BM25) with RRF"
   ],
   "metadata": {
    "id": "vhefCRd-mjk8"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "query = {\"match\": {\"my_text\": \"bowling\"}}\n",
    "\n",
    "knn = {\n",
    "    \"field\": \"my_vector\",\n",
    "    \"k\": 3,\n",
    "    \"num_candidates\": 5,\n",
    "    \"query_vector_builder\": {\n",
    "        \"text_embedding\": {\n",
    "            \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n",
    "            \"model_text\": \"He enjoyed the game\",\n",
    "        }\n",
    "    },\n",
    "}\n",
    "\n",
    "rank: {\"rrf\": {}}\n",
    "\n",
    "fields = [\"my_text\", \"my_metadata\"]\n",
    "\n",
    "\n",
    "response = es.search(\n",
    "    index=index_name, fields=fields, knn=knn, query=query, size=2, source=False\n",
    ")\n",
    "\n",
    "pprint(response[\"hits\"][\"hits\"])"
   ],
   "metadata": {
    "id": "wLY8Q6tEmk06",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "dc4dd649-3a66-4084-cba1-2e0e51984037"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[{'_id': 'U-5Y3IoB3ljSe18vZY6D',\n",
      "  '_index': 'my_vector_index-01',\n",
      "  '_score': 1.8080788,\n",
      "  'fields': {'my_metadata': ['Walter Sobchak'],\n",
      "             'my_text': ['What do you mean brought it bowling, Dude?']}},\n",
      " {'_id': 'VO5Y3IoB3ljSe18vZY6D',\n",
      "  '_index': 'my_vector_index-01',\n",
      "  '_score': 1.2358729,\n",
      "  'fields': {'my_metadata': ['Walter Sobchak'],\n",
      "             'my_text': ['Donny was a good bowler, and a good man. He was one '\n",
      "                         'of us. He was a man who loved the outdoors... and '\n",
      "                         'bowling, and as a surfer he explored the beaches of '\n",
      "                         'Southern California, from La Jolla to Leo Carrillo '\n",
      "                         'and... up to... Pismo']}}]\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Filtering"
   ],
   "metadata": {
    "id": "HDBHn_kamlIL"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "knn = {\n",
    "    \"field\": \"my_vector\",\n",
    "    \"k\": 1,\n",
    "    \"num_candidates\": 5,\n",
    "    \"query_vector_builder\": {\n",
    "        \"text_embedding\": {\n",
    "            \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n",
    "            \"model_text\": \"Did you bring the dog?\",\n",
    "        }\n",
    "    },\n",
    "    \"filter\": {\"term\": {\"my_metadata\": \"The Dude\"}},\n",
    "}\n",
    "\n",
    "fields = [\"my_text\", \"my_metadata\"]\n",
    "\n",
    "response = es.search(index=index_name, fields=fields, knn=knn, source=False)\n",
    "\n",
    "pprint(response[\"hits\"][\"hits\"])"
   ],
   "metadata": {
    "id": "yVDMHuM3mla7",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "ebd848da-8ecc-4683-cb81-719f5a12f815"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[{'_id': 'UO5Y3IoB3ljSe18vZY6D',\n",
      "  '_index': 'my_vector_index-01',\n",
      "  '_score': 0.59285694,\n",
      "  'fields': {'my_metadata': ['The Dude'],\n",
      "             'my_text': [\"Hey, careful, man, there's a beverage here!\"]}}]\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Aggregations\n",
    "and selecting the fields returned"
   ],
   "metadata": {
    "id": "N_Msyv4-m5ow"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "knn = {\n",
    "    \"field\": \"my_vector\",\n",
    "    \"k\": 2,\n",
    "    \"num_candidates\": 5,\n",
    "    \"query_vector_builder\": {\n",
    "        \"text_embedding\": {\n",
    "            \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n",
    "            \"model_text\": \"did you bring it?\",\n",
    "        }\n",
    "    },\n",
    "}\n",
    "\n",
    "aggs = {\"metadata\": {\"terms\": {\"field\": \"my_metadata\"}}}\n",
    "\n",
    "fields = [\"my_text\", \"my_metadata\"]\n",
    "\n",
    "response = es.search(index=index_name, fields=fields, aggs=aggs, knn=knn, source=False)\n",
    "\n",
    "pprint(response[\"hits\"][\"hits\"])"
   ],
   "metadata": {
    "id": "jbwinE0fm5-I",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "e8b02f4b-8a89-417f-a892-2e676a812a2d"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[{'_id': 'U-5Y3IoB3ljSe18vZY6D',\n",
      "  '_index': 'my_vector_index-01',\n",
      "  '_score': 0.74352247,\n",
      "  'fields': {'my_metadata': ['Walter Sobchak'],\n",
      "             'my_text': ['What do you mean brought it bowling, Dude?']}},\n",
      " {'_id': 'UO5Y3IoB3ljSe18vZY6D',\n",
      "  '_index': 'my_vector_index-01',\n",
      "  '_score': 0.6010935,\n",
      "  'fields': {'my_metadata': ['The Dude'],\n",
      "             'my_text': [\"Hey, careful, man, there's a beverage here!\"]}}]\n"
     ]
    }
   ]
  }
 ]
}
